This project involves the analysis and clustering of a wine dataset obtained from Kaggle, containing 13 variables including alcohol, malic acid, and other chemical properties of wines. The dataset does not include any outcome variable, making it well suited to unsupervised learning techniques. The primary goal of this project is to explore the underlying patterns within the data by applying clustering algorithms such as k-means and hierarchical clustering. Additionally, Principal Component Analysis (PCA) will be performed to reduce the dimensionality of the data and to compare the performance and insights derived from these clustering methods.
# Load the libraries ----
library(tidyverse)   # data wrangling and ggplot2
library(tidymodels)  # recipes, resampling, tuning infrastructure
library(tidyclust)   # clustering model specifications

# Load the data and inspect its structure ----
wine <- read_csv("wine-clustering.csv")
glimpse(wine)
## Rows: 178
## Columns: 13
## $ Alcohol <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.39, …
## $ Malic_Acid <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15, 1…
## $ Ash <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61, 2…
## $ Ash_Alcanity <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6, 1…
## $ Magnesium <dbl> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98, 10…
## $ Total_Phenols <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60, 2…
## $ Flavanoids <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51, 2…
## $ Nonflavanoid_Phenols <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31, 0…
## $ Proanthocyanins <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25, 1…
## $ Color_Intensity <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05, 5…
## $ Hue <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06, 1…
## $ OD280 <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58, 2…
## $ Proline <dbl> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295, 10…
EDA
# Exploratory summary statistics for every column
library(skimr)
skim(wine)
# There are no missing values in the data set
Distribution of different variables
# Reshape wide -> long so every variable gets its own facetted histogram
wine %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(Value, fill = Variable)) +
  geom_histogram() +
  facet_wrap(~Variable, scales = "free_x")
# Majority of the variables are right skewed
Correlation analysis
# Pairwise correlations between all chemical properties
library(ggcorrplot)
cor_matrix <- cor(wine, use = "complete.obs")
ggcorrplot(cor_matrix, method = "square", type = "full",
           lab = TRUE, lab_size = 3, tl.cex = 10)
There are a few highly correlated variable pairs — for example, Alcohol and Proline, and Flavanoids and OD280.
Train and test sets
# Reproducible 75/25 train/test split
set.seed(123)
wine_splits <- initial_split(wine, prop = 0.75)
wine_train  <- training(wine_splits)
wine_test   <- testing(wine_splits)
Recipes
# Base preprocessing: drop zero-variance columns, then center and scale.
# Normalizing is essential here because clustering is distance-based.
basic_rec <- recipe(~., data = wine_train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
prep(basic_rec)

# Add a principal component analysis step because of the highly
# correlated variables
pca_rec <- basic_rec %>%
  step_pca(all_numeric_predictors(), num_comp = 4)
Model Specification
# K-means specification; the number of clusters is left as a tuning parameter
kmeans_spec <- k_means(num_clusters = tune()) %>%
  set_engine("stats")
Workflow
# Bundle the preprocessing recipe and the model into one workflow
kmeans_wf <- workflow() %>%
  add_model(kmeans_spec) %>%
  add_recipe(basic_rec)
Tuning
# First create cross-validation folds for tuning
set.seed(4567)
wine_folds <- vfold_cv(wine_train, v = 10)

# Candidate numbers of clusters: 1 through 10
grid <- tibble(num_clusters = 1:10)

# Evaluate each candidate k with the average silhouette width
wine_res <- tune_cluster(
  kmeans_wf,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg)
)
wine_res
## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
## 2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
## 3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
## 4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
## 5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
## 6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
## 7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
## 8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
## 9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Summarise the silhouette score across folds for each cluster count
wine_res %>% collect_metrics()
## # A tibble: 10 × 7
## num_clusters .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 silhouette_avg standard NaN 0 NA Preprocessor1_…
## 2 2 silhouette_avg standard 0.266 10 0.00242 Preprocessor1_…
## 3 3 silhouette_avg standard 0.280 10 0.00210 Preprocessor1_…
## 4 4 silhouette_avg standard 0.258 10 0.00555 Preprocessor1_…
## 5 5 silhouette_avg standard 0.219 10 0.00851 Preprocessor1_…
## 6 6 silhouette_avg standard 0.198 10 0.00776 Preprocessor1_…
## 7 7 silhouette_avg standard 0.172 10 0.0101 Preprocessor1_…
## 8 8 silhouette_avg standard 0.169 10 0.00561 Preprocessor1_…
## 9 9 silhouette_avg standard 0.163 10 0.00755 Preprocessor1_…
## 10 10 silhouette_avg standard 0.155 10 0.00344 Preprocessor1_…
# Visualizing the results: mean silhouette score against cluster count.
# FIX: corrected the axis-label typo ("silhoutte" -> "silhouette") and
# dropped the redundant select() — ggplot only reads the mapped columns.
wine_res %>%
  collect_metrics() %>%
  ggplot(aes(x = num_clusters, y = mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)
Based on the tuning results, the highest silhouette average obtained is 0.28, which corresponds to 3 clusters, indicating that 3 is the optimal number of clusters for the model.
# Fix the number of clusters at 3 (highest mean silhouette) and
# lock it into the workflow
best_cluster <- tibble(num_clusters = 3)
wines_final_wf <- kmeans_wf %>%
  finalize_workflow_tidyclust(best_cluster)
wines_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_zv()
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K Means Cluster Specification (partition)
##
## Main Arguments:
## num_clusters = 3
##
## Computational engine: stats
# Fit the finalized k-means workflow on the training set
wines_fit <- wines_final_wf %>% fit(data = wine_train)
wines_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_zv()
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 3 clusters of sizes 38, 50, 45
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.1161905 0.8995691 0.2483876 0.58560917 -0.06356151 -0.9147589
## 2 0.7584965 -0.3156888 0.3242022 -0.53313502 0.53103730 0.9118989
## 3 -0.9408903 -0.4088708 -0.5699742 0.09785784 -0.53636728 -0.2407579
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 -1.19945157 0.75676445 -0.6915342 0.97607424 -1.1570383
## 2 0.97796390 -0.65445335 0.6213731 0.07029791 0.5518514
## 3 -0.07375634 0.08812485 -0.1064523 -0.90234926 0.3638863
## OD280 Proline
## 1 -1.2471254 -0.3467537
## 2 0.7745446 1.0042800
## 3 0.1925231 -0.8230524
##
## Clustering vector:
## [1] 1 2 1 2 3 2 1 1 1 3 3 1 3 1 2 2 2 2 1 3 3 1 3 3 3 1 2 3 1 1 2 2 2 1 1 2 3
## [38] 1 1 3 3 1 2 2 2 1 3 2 3 3 2 3 1 3 3 2 2 3 1 1 1 3 1 2 2 1 2 2 2 3 1 3 3 2
## [75] 2 2 2 2 1 3 1 3 2 3 3 1 3 3 2 2 2 1 3 3 1 2 2 3 3 2 3 2 1 1 1 3 1 2 2 2 1
## [112] 3 3 1 2 2 2 2 3 2 3 3 1 1 2 3 3 2 3 2 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 237.4795 347.6556 359.7687
## (between_SS / total_SS = 44.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Tabulate the cluster assignments of the training observations
table(extract_cluster_assignment(wines_fit))
## .cluster
## Cluster_1 Cluster_2 Cluster_3
## 38 50 45
Most training observations (50 of 133) fall in the second cluster.
extract_centroids(wines_fit) # returns the location of the centroids (on the normalized scale)
## # A tibble: 3 × 14
## .cluster Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Cluster_1 0.116 0.900 0.248 0.586 -0.0636 -0.915
## 2 Cluster_2 0.758 -0.316 0.324 -0.533 0.531 0.912
## 3 Cluster_3 -0.941 -0.409 -0.570 0.0979 -0.536 -0.241
## # ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_Phenols <dbl>,
## # Proanthocyanins <dbl>, Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>,
## # Proline <dbl>
Visualize the clusters
library(factoextra)
# fviz_cluster() does not accept a workflow object, so pull the fitted
# stats::kmeans model out of the workflow first. The data argument must be
# the same preprocessed (normalized) training data the model was fit on.
kmeans_model <- extract_fit_parsnip(wines_fit)$fit
baked_train <- prep(basic_rec) %>% bake(new_data = NULL)
fviz_cluster(kmeans_model, data = baked_train,
             geom = "point",
             ellipse.type = "convex",
             palette = "jco",
             ggtheme = theme_minimal())
An interactive visualization of clusters in the wine dataset using the variables Alcohol and Malic_Acid.
# Scatter plot of Alcohol vs Malic_Acid, colored by assigned cluster,
# with log10 axes and a smoothed trend line; rendered interactively below.
clustered_train <- bind_cols(extract_cluster_assignment(wines_fit), wine_train)
p <- clustered_train %>%
  ggplot(aes(Alcohol, Malic_Acid)) +
  geom_point(aes(fill = .cluster), shape = 21, alpha = 0.5, size = 5) +
  geom_smooth(color = "blue", se = FALSE) +
  scale_y_log10() +
  scale_x_log10()
library(plotly)
ggplotly(p)
To make predictions using the fitted model, use predict()
# Making some predictions: each row is assigned to its nearest cluster
sampled <- slice_sample(wine_train, n = 10)
predict(wines_fit, new_data = sampled)
## # A tibble: 10 × 1
## .pred_cluster
## <fct>
## 1 Cluster_1
## 2 Cluster_2
## 3 Cluster_1
## 4 Cluster_1
## 5 Cluster_2
## 6 Cluster_2
## 7 Cluster_3
## 8 Cluster_3
## 9 Cluster_1
## 10 Cluster_2
# Predict (not refit) on the held-out test data to assess how new
# observations are assigned to the clusters learned on the training set
wines_pred <- predict(wines_fit, new_data = wine_test)
wines_pred
## # A tibble: 45 × 1
## .pred_cluster
## <fct>
## 1 Cluster_2
## 2 Cluster_2
## 3 Cluster_2
## 4 Cluster_2
## 5 Cluster_2
## 6 Cluster_2
## 7 Cluster_2
## 8 Cluster_2
## 9 Cluster_2
## 10 Cluster_2
## # ℹ 35 more rows
Model specification
# Hierarchical clustering specification with Ward (D2) linkage;
# the number of clusters is left as a tuning parameter
hc_spec <- hier_clust(
  num_clusters = tune(),
  linkage_method = "ward.D2"
) %>%
  set_engine("stats")
hc_spec
## Hierarchical Clustering Specification (partition)
##
## Main Arguments:
## num_clusters = tune()
## linkage_method = ward.D2
##
## Computational engine: stats
Workflow
# Reuse the normalization recipe with the hierarchical model
hc_wf <- workflow() %>%
  add_model(hc_spec) %>%
  add_recipe(basic_rec)
hc_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: hier_clust()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_zv()
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Hierarchical Clustering Specification (partition)
##
## Main Arguments:
## num_clusters = tune()
## linkage_method = ward.D2
##
## Computational engine: stats
Model tuning
# Tune the number of clusters over the same folds and grid as k-means
set.seed(4566)
hc_res <- tune_cluster(
  hc_wf,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg)
)
hc_res
## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
## 2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
## 3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
## 4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
## 5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
## 6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
## 7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
## 8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
## 9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Aggregate silhouette scores per cluster count
hc_res %>% collect_metrics()
## # A tibble: 10 × 7
## num_clusters .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 silhouette_avg standard NaN 0 NA Preprocessor…
## 2 2 silhouette_avg standard 0.241 10 0.00704 Preprocessor…
## 3 3 silhouette_avg standard 0.275 10 0.00274 Preprocessor…
## 4 4 silhouette_avg standard -0.00901 10 0.0444 Preprocessor…
## 5 5 silhouette_avg standard -0.0797 10 0.0302 Preprocessor…
## 6 6 silhouette_avg standard -0.107 10 0.0229 Preprocessor…
## 7 7 silhouette_avg standard -0.0626 10 0.0171 Preprocessor…
## 8 8 silhouette_avg standard -0.00273 10 0.0136 Preprocessor…
## 9 9 silhouette_avg standard -0.0270 10 0.0192 Preprocessor…
## 10 10 silhouette_avg standard -0.0504 10 0.0226 Preprocessor…
# Visualizing the result: mean silhouette score against cluster count.
# FIX: corrected the axis-label typo ("silhoutte" -> "silhouette").
hc_res %>%
  collect_metrics() %>%
  ggplot(aes(num_clusters, mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)
After fitting the hierarchical model, the silhouette average score obtained was 0.27, corresponding to 3 clusters. This is consistent with the k-means model, which also identified 3 clusters as optimal.
# Finalize the hierarchical workflow at k = 3 and fit on the training data
best_cluster <- tibble(num_clusters = 3)
hc_final <- hc_wf %>% finalize_workflow_tidyclust(best_cluster)
hc_fit <- hc_final %>% fit(data = wine_train)
hc_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: hier_clust()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_zv()
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::hclust(d = stats::as.dist(dmat), method = linkage_method)
##
## Cluster method : ward.D2
## Number of objects: 133
# Cluster assignments: tabulate cluster sizes from the hierarchical fit
table(extract_cluster_assignment(hc_fit))
## .cluster
## Cluster_1 Cluster_2 Cluster_3
## 36 42 55
Plot the dendrogram
# Extract the underlying hclust object and draw the dendrogram,
# cutting the tree into k = 3 colored clusters.
# NOTE(review): fviz_dend() comes from factoextra (loaded earlier);
# dendextend is attached here but does not appear to be used by this
# call — confirm whether it is needed.
hc_model <- extract_fit_parsnip(hc_fit)$fit
library(dendextend)
fviz_dend(hc_model, k = 3, horiz = TRUE,
          rect = TRUE, rect_fill = TRUE, rect_border = "jco",
          k_colors = "jco", cex = 0.1)
Basic Recipe
# Re-declare the normalization recipe (identical to the one above), then
# prep() estimates the centering/scaling statistics from wine_train and
# bake() applies them to the held-out test set
basic_rec <- recipe(~., data = wine_train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())
wine_processed <- basic_rec %>%
  prep() %>%
  bake(new_data = wine_test)
wine_processed
## # A tibble: 45 × 13
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.46 -0.569 0.164 -1.19 1.85 0.806 1.02
## 2 0.190 -0.507 -0.908 -2.49 0.0204 0.571 0.731
## 3 0.141 0.00733 1.05 -0.301 0.0883 0.806 1.20
## 4 1.00 -0.887 -0.428 -1.07 -0.115 1.09 1.11
## 5 1.65 -0.427 -0.0211 -2.26 0.156 1.59 1.58
## 6 0.967 -0.693 0.866 0.114 1.04 1.04 1.35
## 7 0.733 0.663 0.644 -1.31 1.11 0.650 0.993
## 8 0.314 -0.560 -0.908 -0.776 -0.387 0.179 0.178
## 9 1.02 -0.400 1.53 -0.0642 0.496 1.04 0.934
## 10 0.782 -0.462 -0.0951 -0.716 0.292 0.210 0.663
## # ℹ 35 more rows
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## # Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>
# Function that helps in implementing PCA: preps a recipe, bakes a data
# set through it, and draws a scatter-plot matrix of the first four
# principal components (density plots on the diagonal).
# - recipe: an unprepped recipe whose baked output contains PC1..PC4
# - dat: data to bake through the recipe (defaults to the test set)
library(ggforce)
plot_test2 <- function(recipe, dat = wine_test) {
  recipe %>%
    prep() %>%
    bake(new_data = dat) %>%
    ggplot() +
    # FIX: a constant colour belongs outside aes(). Wrapping it in aes()
    # mapped the literal string "steelblue" onto the colour scale (default
    # palette + a spurious legend) instead of actually coloring the points.
    geom_autopoint(color = "steelblue", alpha = 0.5, size = 1) +
    geom_autodensity(alpha = 0.3) +
    facet_matrix(rows = vars(PC1, PC2, PC3, PC4), layer.diag = 2)
}
# PCA: reduce the normalized predictors to 4 components and plot
# their pairwise scatter matrix on the test set
basic_rec %>%
  step_pca(all_numeric_predictors(), num_comp = 4) %>%
  plot_test2() +
  theme_bw()
PC1 and PC2 capture small clusters in the data, indicating that groups of data points in the dataset share some similarities.
# What features are driving performance? Inspect the PCA loadings.
# tidy(number = 3) pulls the results of the third recipe step
# (zv -> normalize -> pca), then the five variables with the largest
# absolute loading are kept per component.
basic_rec %>%
  step_pca(all_numeric_predictors(), num_comp = 4) %>%
  prep() %>%
  tidy(number = 3) %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  slice_max(abs(value), n = 5) %>%
  ungroup() %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col(position = "dodge", alpha = 0.7) +
  facet_wrap(~component, scales = "free_y") +
  labs(x = "Contribution to Principal Component", y = NULL, fill = "Positive?") +
  theme_bw()
# Fit the model, including step_pca ----
pca_rec <- basic_rec %>%
  step_pca(all_numeric_predictors(), num_comp = 4)

# Workflow combining the PCA recipe with the tunable k-means spec
kmeans_wf_pca <- workflow() %>%
  add_model(kmeans_spec) %>%
  add_recipe(pca_rec)

# Tune the number of clusters on the same folds and grid as before
set.seed(3456)
wine_res_pca <- tune_cluster(
  kmeans_wf_pca,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg)
)
wine_res_pca
## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
## 2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
## 3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
## 4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
## 5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
## 6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
## 7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
## 8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
## 9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Per-k silhouette summaries for the PCA pipeline
wine_res_pca %>% collect_metrics()
## # A tibble: 10 × 7
## num_clusters .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 silhouette_avg standard NaN 0 NA Preprocessor1_…
## 2 2 silhouette_avg standard 0.342 10 0.0152 Preprocessor1_…
## 3 3 silhouette_avg standard 0.402 10 0.00249 Preprocessor1_…
## 4 4 silhouette_avg standard 0.351 10 0.00541 Preprocessor1_…
## 5 5 silhouette_avg standard 0.333 10 0.0118 Preprocessor1_…
## 6 6 silhouette_avg standard 0.295 10 0.00907 Preprocessor1_…
## 7 7 silhouette_avg standard 0.285 10 0.00640 Preprocessor1_…
## 8 8 silhouette_avg standard 0.264 10 0.00819 Preprocessor1_…
## 9 9 silhouette_avg standard 0.271 10 0.00318 Preprocessor1_…
## 10 10 silhouette_avg standard 0.253 10 0.00688 Preprocessor1_…
# Visualizing the results for the PCA + k-means pipeline.
# FIX: corrected the axis-label typo ("silhoutte" -> "silhouette") and
# dropped the redundant select() — ggplot only reads the mapped columns.
wine_res_pca %>%
  collect_metrics() %>%
  ggplot(aes(x = num_clusters, y = mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)
After adding a PCA step to the k-means model, the silhouette average score improved to 0.40, with 3 still being the optimal number of clusters.
# Finalize the PCA workflow at k = 3 (best_cluster defined earlier)
wines_final_wf_pca <- kmeans_wf_pca %>%
  finalize_workflow_tidyclust(best_cluster)
wines_final_wf_pca
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_zv()
## • step_normalize()
## • step_pca()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K Means Cluster Specification (partition)
##
## Main Arguments:
## num_clusters = 3
##
## Computational engine: stats
# Fit the finalized PCA + k-means workflow on the training data
wines_fit_pca <- wines_final_wf_pca %>% fit(data = wine_train)
wines_fit_pca
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_zv()
## • step_normalize()
## • step_pca()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 3 clusters of sizes 38, 52, 43
##
## Cluster means:
## PC1 PC2 PC3 PC4
## 2 2.6442278 -1.2106895 -0.157634206 0.07725296
## 1 -2.2816279 -0.6278453 -0.004578984 -0.07358049
## 3 0.4224185 1.8291664 0.144842024 0.02071100
##
## Clustering vector:
## [1] 1 2 1 2 3 2 1 1 1 3 3 1 3 1 2 2 2 2 1 3 3 1 3 3 3 1 2 3 1 1 2 2 2 1 1 2 3
## [38] 1 1 3 3 1 2 2 2 1 3 2 3 3 2 3 1 3 3 2 2 3 1 1 1 3 1 2 2 1 2 2 2 3 1 3 3 2
## [75] 2 2 2 2 1 3 1 3 2 3 2 1 2 3 2 2 2 1 3 3 1 2 2 3 3 2 3 2 1 1 1 3 1 2 2 2 1
## [112] 3 3 1 2 2 2 2 3 2 3 3 1 1 2 3 3 2 3 2 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 104.8761 213.9112 188.4367
## (between_SS / total_SS = 60.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualize the clusters in PCA space: extract the fitted kmeans object
# and pass it the baked (PCA-transformed) training data
kmeans_model_pca <- extract_fit_parsnip(wines_fit_pca)$fit
baked_pca <- prep(pca_rec) %>% bake(new_data = NULL)
fviz_cluster(kmeans_model_pca, data = baked_pca,
             geom = "point",
             ellipse.type = "circle",
             palette = "jco",
             ggtheme = theme_minimal())